# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import missingno as msno
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')
%matplotlib inline
# Load the Mall Customers dataset and preview the first five rows.
# Columns (per the output below): CustomerID, Genre, Age,
# Annual Income (k$), Spending Score (1-100).
df = pd.read_csv('Mall_Customers.csv')
df.head()
|   | CustomerID | Genre | Age | Annual Income (k$) | Spending Score (1-100) |
|---|---|---|---|---|---|
| 0 | 1 | Male | 19 | 15 | 39 |
| 1 | 2 | Male | 21 | 15 | 81 |
| 2 | 3 | Female | 20 | 16 | 6 |
| 3 | 4 | Female | 23 | 16 | 77 |
| 4 | 5 | Female | 31 | 17 | 40 |
# Summary statistics (count / mean / std / quartiles) for the numeric columns.
df.describe()
|   | CustomerID | Age | Annual Income (k$) | Spending Score (1-100) |
|---|---|---|---|---|
| count | 200.000000 | 200.000000 | 200.000000 | 200.000000 |
| mean | 100.500000 | 38.850000 | 60.560000 | 50.200000 |
| std | 57.879185 | 13.969007 | 26.264721 | 25.823522 |
| min | 1.000000 | 18.000000 | 15.000000 | 1.000000 |
| 25% | 50.750000 | 28.750000 | 41.500000 | 34.750000 |
| 50% | 100.500000 | 36.000000 | 61.500000 | 50.000000 |
| 75% | 150.250000 | 49.000000 | 78.000000 | 73.000000 |
| max | 200.000000 | 70.000000 | 137.000000 | 99.000000 |
# Dtypes and non-null counts -- the output below shows 200 non-null entries
# in every column, i.e. no missing values to impute.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 200 entries, 0 to 199 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 200 non-null int64 1 Genre 200 non-null object 2 Age 200 non-null int64 3 Annual Income (k$) 200 non-null int64 4 Spending Score (1-100) 200 non-null int64 dtypes: int64(4), object(1) memory usage: 7.9+ KB
# Drop the row-identifier column: it carries no information for clustering.
df.drop('CustomerID', axis = 1, inplace = True)

# Distribution plots for the three numeric features.
# FIX: sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot(kde=True) is the supported equivalent (histogram + KDE overlay).
plt.figure(figsize = (20, 8))
for plotnumber, col in enumerate(['Age', 'Annual Income (k$)', 'Spending Score (1-100)'], start = 1):
    plt.subplot(1, 3, plotnumber)
    sns.histplot(df[col], kde = True)
plt.tight_layout()
plt.show()
# Gender split as a pie chart.
values = df['Genre'].value_counts()
# BUG FIX: labels were hard-coded as ['Male', 'Female'], but value_counts()
# orders by frequency (Female is the majority in this dataset), so the wedges
# were labelled with the wrong gender. Take labels from the counts' index so
# each wedge always matches its count.
labels = values.index.tolist()
fig, ax = plt.subplots(figsize = (4, 4), dpi = 100)
explode = (0, 0.06)  # pull the second (smaller) wedge slightly out
patches, texts, autotexts = ax.pie(values, labels = labels, autopct = '%1.2f%%', shadow = True,
                                   startangle = 90, explode = explode)
plt.setp(texts, color = 'orange')
plt.setp(autotexts, size = 12, color = 'white')
autotexts[1].set_color('black')  # white text is unreadable on the lighter wedge
plt.show()
# Bucket customers into age groups and plot the group sizes.
age_18_25 = df.Age[(df.Age >= 18) & (df.Age <= 25)]
age_26_35 = df.Age[(df.Age >= 26) & (df.Age <= 35)]
age_36_45 = df.Age[(df.Age >= 36) & (df.Age <= 45)]
age_46_55 = df.Age[(df.Age >= 46) & (df.Age <= 55)]
# BUG FIX: this bucket used `df.Age >= 55`, double-counting 55-year-olds who
# are already in the 46-55 bucket; the "55+" group must be strictly above 55.
age_55above = df.Age[df.Age > 55]
x_age = ['18-25', '26-35', '36-45', '46-55', '55+']
y_age = [len(age_18_25), len(age_26_35), len(age_36_45), len(age_46_55),
         len(age_55above)]
# Plot the aggregated lists directly; passing data_frame=df alongside
# external x/y lists was misleading (df itself is not used by the figure).
px.bar(x = x_age, y = y_age, color = x_age,
       title = 'Number of customers per age group')
# Scatter of annual income against spending score -- the clearest view of
# the customer segments that the clustering below will recover.
px.scatter(
    df,
    x = 'Annual Income (k$)',
    y = 'Spending Score (1-100)',
    title = 'Relation between Annual Income and Spending Score',
)
# Drop the categorical gender column, then standardise the remaining numeric
# features to zero mean / unit variance so no feature dominates the distance
# metric used by the clustering algorithms.
df = df.drop(columns = ['Genre'])
scaler = StandardScaler()
data_scaled = scaler.fit_transform(df)
# Elbow method: fit K-means for K = 1..15 and record the inertia (SSE).
sse = []
for k in range(1, 16):
    # n_init is pinned explicitly: its default changed from 10 to 'auto' in
    # scikit-learn 1.4, so pinning keeps the curve reproducible across versions.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0)
    kmeans.fit(data_scaled)
    sse.append(kmeans.inertia_)

# Plot the Elbow curve: the "elbow" (where the SSE drop flattens) suggests K.
plt.figure(figsize=(8, 6))
plt.plot(range(1, 16), sse, marker='o', linestyle='--')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Sum of Squared Errors (SSE)')
plt.title('Elbow Method for Optimal K')
plt.grid()
plt.show()
# Final model: K = 5 (chosen from the elbow plot) on the scaled features.
# n_init=10 is pinned explicitly (default changed to 'auto' in scikit-learn
# 1.4) so the cluster assignment stays reproducible across versions.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0)
df['Cluster'] = kmeans.fit_predict(data_scaled)

# Visualize the clusters in the income/spending plane.
plt.scatter(df['Annual Income (k$)'], df['Spending Score (1-100)'], c=df['Cluster'], cmap='rainbow')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.title('K-means Clustering with 5 Clusters')
plt.show()

# Print the cluster centers mapped back to the original (unscaled) feature
# units; df.columns[:-1] excludes the 'Cluster' column just appended above.
cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)
cluster_centers_df = pd.DataFrame(cluster_centers, columns=df.columns[:-1])
print(cluster_centers_df)
Age Annual Income (k$) Spending Score (1-100) 0 39.871795 86.102564 19.358974 1 55.638298 54.382979 48.851064 2 25.185185 41.092593 62.240741 3 46.250000 26.750000 18.350000 4 32.875000 86.100000 81.525000
# BUG FIX: X3 was never defined anywhere in this script, so this section
# raised a NameError. Rebuild it as the income/spending feature pair --
# the same two features the scatter plot below labels on its axes.
X3 = df[['Annual Income (k$)', 'Spending Score (1-100)']].values

# Dendrogram with Ward linkage to gauge a sensible number of clusters.
plt.figure(figsize = (17, 8))
dendo = dendrogram(linkage(X3, method = 'ward'))
plt.title('Dendrogram', fontsize = 15)
plt.show()

# Agglomerative clustering with 5 clusters.
# FIX: the `affinity` keyword was deprecated in scikit-learn 1.2 and removed
# in 1.4; euclidean is both the default and the only metric Ward accepts,
# so the argument is simply dropped.
agc = AgglomerativeClustering(n_clusters = 5, linkage = 'ward')
labels = agc.fit_predict(X3)

# One scatter per cluster (loop replaces five copy-pasted calls).
plt.figure(figsize = (12, 8))
for cluster_id in range(5):
    plt.scatter(X3[labels == cluster_id, 0], X3[labels == cluster_id, 1],
                label = f'Cluster {cluster_id + 1}', s = 100)
plt.legend(loc = 'best')
plt.title('Clusters of Customers\n ', fontsize = 20)
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.show()
# Synthetic DBSCAN demo: three Gaussian blobs, standardised before fitting.
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers,
                            cluster_std=0.4, random_state=0)
X = StandardScaler().fit_transform(X)

# Fit DBSCAN: a point with at least min_samples neighbours within eps
# becomes a core point; clusters grow outward from core points.
db = DBSCAN(eps=0.3, min_samples=10).fit(X)

# Boolean mask over all samples marking the core points (used when plotting).
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# DBSCAN labels noise as -1; exclude it from the cluster count.
found = set(labels)
n_clusters_ = len(found) - (1 if -1 in found else 0)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

# Homogeneity is 1.0 when every cluster contains only members of a single
# ground-truth class.
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
Estimated number of clusters: 3 Estimated number of noise points: 18 Homogeneity: 0.953
# Plot the DBSCAN result: one colour per cluster, black for noise.
plt.figure(figsize = (10, 8))
unique_labels = set(labels)
# One colour per label, sampled evenly from the Spectral colormap.
colors = [plt.cm.Spectral(t) for t in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Noise points are drawn in black.
        col = [0, 0, 0, 1]
    class_member_mask = (labels == k)
    # Core members of this cluster.
    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)
    # Border (non-core) members of the same cluster.
    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)
plt.title('Clustering using DBSCAN\n', fontsize = 15)
plt.show()